# pip install pypdf

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Path of the PDF to process.
file_path = "./llm_langchain_data/pytorch.pdf"

# Load the PDF and split it into documents.
loader = PyPDFLoader(file_path)
pages = loader.load_and_split()

# Print the first resulting document as a quick sanity check.
first_document = pages[0]
print(first_document)

# Vector search
print("\n\n\n--------------------------------------向量搜索----------------------------------------")
# Build the embedding model from a local path and run it on the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name="D:/models/BAAI_bge-small-zh-v1.5",
    model_kwargs={"device": "cpu"},
)
# Index the loaded documents in FAISS, then ask for the two best matches.
faiss_index = FAISS.from_documents(pages, embeddings)
docs = faiss_index.similarity_search("What is PyTorch?", k=2)
for hit in docs:
    # Show the "page" metadata value followed by the first 300 characters.
    page_number = hit.metadata["page"]
    print(f"{page_number}:", hit.page_content[:300])


# Extract images
print("\n\n\n--------------------------------------提取第8页的图片----------------------------------------")
# Requires: pip install rapidocr-onnxruntime
image_loader = PyPDFLoader(file_path, extract_images=True)
image_pages = image_loader.load()
# Print the content of page 8 (index 7), which should include text recognized
# from that page's images. Guard the index so a PDF with fewer than 8 pages
# produces a clear message instead of a bare IndexError.
target_index = 7
if len(image_pages) > target_index:
    print(image_pages[target_index].page_content)
else:
    print(
        f"Cannot show page {target_index + 1}: "
        f"the loader returned only {len(image_pages)} page(s)."
    )